Required Packages

library(tidyverse)
library(RColorBrewer)
library(readxl)
library(minfi)
library(ggplot2)
library(ENmix)
library(stringr)
library(geneplotter)
library(ChAMP)
library(wateRmelon)
library(sva)
library(pdftools)
library(filesstrings)
library(Harman)
library(DMRcate)

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)

Load Data

First, we load the PBMCs sample metadata. Note that the orignal sample metadata had a number of errors: there was a duplication of sample IDs.

# Get the sample info. Note that the top-level file for this R project is a
# clone of the "Superfund-Methylatio" GitHub repository. The data is stored in a
# copy of the "Arsenic Epigeenetics META" repository, locally named
# "arsenic-epigenetics-meta".
sample_metadata <- read_xlsx(
  path = paste0(
    "../../arsenic-epigenetics-meta/Chile/20170212_AC2013_DNAmethArray_Buccal",
    "_PBMCs_MetaData.xlsx"),
  sheet = "AC2013_DNAmeth_PL1_PBMCs"
) %>%
  mutate(
    position = `Position (on FINAL PLATE)`,
    sample_id = `Sample ID`,
    pbmc_sample_id = `PBMC ID`,
    study_id = `Study_ID...5`,
    exposed = as.factor(if_else(`Exposed: Yes:1; No:2` == 1, "yes", "no")),
    age = P3Q8_Age,
    sex = as.factor(if_else(`P3Q9_Sex_(M=1,F=2)` == 1, "M", "F")),
    smoking = as.factor(if_else(P38Q96_Smoking == 1, "yes", "no")),
    secondhand_smoking = as.factor(
      if_else(P40Q116_Smoking_Secondhand_for_non_smokers == "1", "yes", "no")
    ),
    diabetes = as.factor(if_else(P11Q35_Diabetes == 1, "yes", "no")),
    skin_cancer = as.factor(if_else(P10Q33_Cancer_Skin == 1, "yes", "no")),
    hypertension = as.factor(if_else(P12Q37_Hypertension == 1, "yes", "no")),
    cardio = as.factor(if_else(P13Q38_Cardio == 1, "yes", "no")),
    breastfed = as.factor(if_else(P26Q63_Breastfed == "1", "yes", "no")),
    gastric = as.factor(if_else(P16Q43_Gastric == 1, "yes", "no")),
    tuberculosis = as.factor(if_else(P15Q42_Tuberculosis == 1, "yes", "no")),
    intestinal = as.factor(if_else(P17Q45_Intestinal == 1, "yes", "no")),
    childhood_diseases = as.factor(
      if_else(P20Q50_Childhood_diseases == 1, "yes", "no")
    ),
    anemia = as.factor(if_else(P18Q47_Anemia == 1, "yes", "no")),
    immune = as.factor(if_else(P19Q48_Immune == 1, "yes", "no")),
    antibiotics = as.factor(
      if_else(`P22Q54_Antibiotics_(Yes=1,No=2)` == 1, "yes", "no")
    ),
    immunosuppresor = as.factor(
      if_else(`P24Q58_Immuosupresores_(Yes=1,No=2)` == 1, "yes", "no")
    ),
    laxatives = as.factor(
      if_else(`P25Q62_laxatives_(Yes=1,No=2)` == 1, "yes", "no")
    ),
    med_high_triglycerides = as.factor(
      if_else(Med_High_triglycerides == 1, "yes", "no")
    ),
    med_insulin_resistance = as.factor(
      if_else(Med_Insulin_resistance == 1, "yes", "no")
    ),
    med_hypertension = as.factor(if_else(Med_Hypertension == 1, "yes", "no")),
    med_diabetes = as.factor(if_else(Med_Diabetes == 1, "yes", "no")),
    med_high_cholesterol = as.factor(
      if_else(Med_High_Cholesterol == 1, "yes", "no")
    ),
    med_anticoagulant = as.factor(if_else(Med_anticoagulant == 1, "yes", "no")),
    weight_20 = P4Q12a_Weight20,
    weight_40 = P4Q12b_Weight40
  ) %>%
  dplyr::select(
    position, sample_id, pbmc_sample_id, study_id, exposed, age, sex, smoking,
    secondhand_smoking, diabetes, skin_cancer, hypertension, cardio, breastfed,
    gastric, tuberculosis, intestinal, childhood_diseases, anemia, immune,
    antibiotics, immunosuppresor, laxatives, med_high_triglycerides,
    med_insulin_resistance, med_hypertension, med_diabetes,
    med_high_cholesterol, med_anticoagulant, weight_20, weight_40
  ) %>%
  filter(
    sample_id != "QC", pbmc_sample_id != "BLANK"
  )

# assign duplicates
sample_metadata <- sample_metadata %>%
  mutate(
    duplicates = if_else(sample_id %in% c("P17", "P29"), 1,
                   if_else(sample_id %in% c("P3", "P46"), 2,
                     if_else(sample_id %in% c("P5", "P10"), 3,
                       if_else(sample_id %in% c("P4", "P15"), 4, 0))))
  )

# now load the sentrix id and position data
sample_info <- read_csv(
  "../../arsenic-epigenetics-meta/Chile/Sample_Info-Berkeley.csv"
) %>%
  dplyr::select(
    Sample.ID, Sample_Well, Sentrix_ID, Sentrix_Position
  ) %>%
  mutate(
    sentrix_full = paste(Sentrix_ID, Sentrix_Position, sep = "_"),
    sample_id = Sample.ID,
    position = Sample_Well
  ) %>%
  dplyr::select(
    sample_id, position, sentrix_full
  ) %>%
  filter(
    sample_id %in% paste0("P", 1:47)
  )
 
# create a vector of idat filenames
sample_info$Basename <- paste0(
  paste0("/Users/philippe/Documents/berkeley/",
         "superfund/arsenic-epigenetics-meta/Chile/idat/"),
  sample_info$sentrix_full
)

# include the base idat filenames to the sample metadata
sample_info <- sample_info %>%
  dplyr::select(position, Basename)
sample_metadata <- sample_metadata %>%
  left_join(sample_info, by = "position")

With the meta data completed, we now remove the duplicates from the data, retaining only the first sample in order of sample ID from each individual. The duplicated samples are listed below:

  • Participant AC130010: P17 and P29
  • Participant AC130031: P3 and P46
  • Participant AC130048: P5 and P10
  • Participant AC130084: P4 and P15
keep <- !(sample_metadata$sample_id %in% c("P17", "P3", "P5", "P4"))
sample_metadata <- sample_metadata[keep, ]
rm(sample_info, keep)

Create RGChannelSetExtended Object

Next, we create or load the RGChannelSetExtended object, depending on whether or not it’s been generated locally in the past.

# if the RGset already exists, run this code chunk instead of the one above
load("../../arsenic-epigenetics-meta/Chile/pbmcs/data/pmbcs_RGset.RData")
pheno <- pData(RGset)
sample_name <- rownames(pheno)
pheno <- data.frame(pheno, sample_name)

# adding Sentrix ID for subsequent batch correction in downstream analysis
sentrix_id <- as.factor(
  stringr::str_extract_all(pheno$sample_name, "^([0-9]{12})",
                           simplify = TRUE)
)
pheno <- data.frame(pheno, sentrix_id)

# remove features without any variation
pheno <- pheno[, apply(pheno, 2, function(x) length(unique(x)) > 1)]

Internal Quality Control

First, we check that the signal to noise ratio of each sample. The average detection p-value of the samples are very low, and so no samples are removed at this stage.

# check detection p-values
det_p <- detectionP(RGset)
pal <- brewer.pal(8, "Dark2")
barplot(
  colMeans(det_p), col = pal[factor(pheno$sample_id)], las = 2,
  cex.names = 0.8, ylab = "Mean detection p-values",
  names.arg = pheno$sample_id
)

We then produce a PDF version QC report for Illumina Infinium Human Methylation 450k arrays, which is useful for identifying failed samples.

Outlier information

Pre-Filtering

A number of functions are run sequentially on the RGset object. First, outliers are identified based on their median unmethylated/methylated intensity.

Next, champ.QC is used to perform a brief exploratory data analysis. The resulting plots are listed below.

# Create pdf of quality control plots: mdsplot, densityPlot, dendrogram, et.c
champ.QC(
  beta = getBeta(Mset), pheno = pheno$exposed, mdsPlot = TRUE,
  densityPlot = TRUE, dendrogram = TRUE, PDFplot = TRUE, Rplot = FALSE,
  Feature.sel = "None", resultsDir = "preprocessing-reports/QC_preFiltering"
)

Pre-Filtering Beta Values

Pre-Filtering MDS Plot

Pre-Filtering Dendogram

Pre-Filtering SVD

# remove uninteresting variables from SVD
rm_svd <- c("position", "sample_id", "buccal_sample_id", "study_id",
            "duplicates", "Basename", "filenames", "sample_name", "sentrix_id",
            "xMed", "yMed", "predictedSex")

# run SVD
champ.SVD(
  beta = getBeta(Mset)[complete.cases(getBeta(Mset)), ],
  pd = pheno[, -which(colnames(pheno) %in% rm_svd)],
  RGEffect = TRUE, Rplot = FALSE,
  resultsDir = "preprocessing-reports/QC_preFiltering/"
)

Post Filtering

Finally, quantitiative measures of data quality are computed, including number of beads, averaged bisulfite conversion intensity, and detection p-values of probes. Low quality samples and probes are identified, as well as the outliers based on total intensity and beta value distributions. The results are listed below. We also remove cross-reactive probes and probes within SNPs with minor allele frequencies below 5%.

# compute QC metrics
qc <- ENmix::QCinfo(RGset, distplot = FALSE)
## 0  samples with percentage of low quality CpG value greater than  0.05  or bisulfite intensity less than  7016.211 
## 5261  CpGs with percentage of low quality value greater than  0.05 
## Ploting qc_sample.jpg ...
## Done
## Ploting qc_CpG.jpg ...
## Done
## Identifying ourlier samples based on beta or total intensity values...
## After excluding low quality samples and CpGs
## 0  samples are outliers based on averaged total intensity value 
## 2  samples are outliers in beta value distribution 
## 2  outlier samples were added into badsample list
# beta: percentage of metylation
# M: log2 ratio of intensities of methylated vs. unmethylated
betas <- rmSNPandCH(
  getBeta(Mset), dist = 2, mafcut = 0.05, and = TRUE, rmcrosshyb = TRUE,
  rmXY = FALSE
)
mvals <- beta2m(betas)

# removing filtered probes from qc
qc$nbead <- qc$nbead[rownames(qc$nbead) %in% rownames(betas), ]
qc$detP <- qc$detP[rownames(qc$detP) %in% rownames(betas), ]

filtered <- champ.filter(
  beta = betas, M = mvals,
  pd = pheno, beadcount = qc$nbead, detP = qc$detP,
  arraytype = "EPIC", filterXY = TRUE, SampleCutoff = 0.05
)
##                     Failed CpG Fraction.
## 200607110021_R01C01         0.0002084504
## 200607110022_R01C01         0.0006526484
## 200607110032_R01C01         0.0006749823
## 200607110040_R01C01         0.0007568736
## 200607110021_R02C01         0.0005211261
## 200607110022_R02C01         0.0002754524
## 200607110029_R02C01         0.0009839357
## 200607110032_R02C01         0.0006030173
## 200607110033_R02C01         0.0006725008
## 200607110040_R02C01         0.0006191474
## 200607110022_R03C01         0.0002431922
## 200607110029_R03C01         0.0009380269
## 200607110032_R03C01         0.0003263242
## 200607110033_R03C01         0.0003002679
## 200607110040_R03C01         0.0005347746
## 200607110022_R04C01         0.0005384969
## 200607110029_R04C01         0.0009293415
## 200607110032_R04C01         0.0006104620
## 200607110033_R04C01         0.0003089533
## 200607110040_R04C01         0.0005707571
## 200607110022_R05C01         0.0002233397
## 200607110029_R05C01         0.0006787047
## 200607110032_R05C01         0.0003300465
## 200607110040_R05C01         0.0003672698
## 200607110021_R06C01         0.0002742116
## 200607110022_R06C01         0.0002940640
## 200607110029_R06C01         0.0008784697
## 200607110032_R06C01         0.0005893688
## 200607110033_R06C01         0.0003399727
## 200607110040_R06C01         0.0004156601
## 200607110021_R07C01         0.0002990271
## 200607110022_R07C01         0.0002791747
## 200607110029_R07C01         0.0007878930
## 200607110032_R07C01         0.0003833999
## 200607110033_R07C01         0.0003374912
## 200607110040_R07C01         0.0003300465
## 200607110021_R08C01         0.0003498989
## 200607110029_R08C01         0.0009020444
## 200607110032_R08C01         0.0008722658
## 200607110033_R08C01         0.0007332988

We next perform the same exploratory data analysis as before, but with the low-quality probes removed.

# filter sample and probes
pheno <- filtered$pd
betas <- filtered$beta
mvals <- filtered$M

filtered_CpG <- rownames(betas)
unfiltered_CpG <- rownames(getBeta(Mset))
outCpG <- unfiltered_CpG[!(unfiltered_CpG %in% filtered_CpG)]
  
# recompute the density plots
champ.QC(
  beta = betas, pheno = pheno$exposed, mdsPlot = TRUE,
  densityPlot = TRUE, dendrogram = TRUE, PDFplot = TRUE, Rplot = FALSE,
  Feature.sel = "None",
  resultsDir = "preprocessing-reports/QC_postFiltering"
)

# save the csv
write.csv(
  pheno,
  file = "../../arsenic-epigenetics-meta/Chile/pbmcs/data/pbmcs_pheno.csv"
)

Post-Filtering Beta Values

Post-Filtering MDS Plot

Post-Filtering Dendogram

Post-Filtering SVD

# run SVD
champ.SVD(
  beta = betas,
  pd = pheno[, -which(colnames(pheno) %in% rm_svd)],
  RGEffect = TRUE, Rplot = FALSE,
  resultsDir = "preprocessing-reports/QC_postFiltering/"
)

Estimate Cell Counts

We now estimate the cell-type decomposition of each sample, and plot their distributions.

# minfi requires 450k for cell composition proportions
pbmc_450k <- convertArray(read.metharray(pheno$Basename),
                            outType = "IlluminaHumanMethylation450k")

# make the plot depicting the average DNA methylation across the cell-type
# discrimating probes in both the provided and sorted data.
# The means from the provided heterogeneous samples should be within the
# range of the sorted samples.
# If the sample means fall outside the range of the sorted means,
# the cell type estimates will inflated to the closest cell type.

# which cell type?
pdf(file = "preprocessing-reports/pbmcs_meanCellCount.pdf")
estimateCellCounts(
  pbmc_450k, compositeCellType = "Blood",
  referencePlatform = "IlluminaHumanMethylation450k", returnAll = FALSE,
  meanPlot = TRUE, verbose = FALSE)
dev.off()
   
# save the composition estimates across all samples and cell types.
# returnAll = TRUE, quantile normalize the sorted data with the provided data
# to reduce these batch effects and return a GenomicMethylSet.
cell_counts <- estimateCellCounts(
  pbmc_450k, compositeCellType = "Blood",
  referencePlatform = "IlluminaHumanMethylation450k", returnAll = TRUE,
  meanPlot = FALSE, verbose = TRUE
)

Analyze Cell Counts

cell_counts <- read.csv("preprocessing-reports/cellcounts_pbmcs.csv",
                        row.names = 1)
boxplot(
  cell_counts * 100, col = seq_len(ncol(cell_counts)),
  xlab = "Cell type", ylab = "Estimated %", main = "Cell type distribution"
)

Normalization

We now normalize the data, and re-perform the exploratory data analysis to validate that the quality control measures were successful.

Funnorm Normalization

funnorm <- preprocessFunnorm(RGset)
betas_fun <- minfi::getBeta(funnorm)
betas_fun <- betas_fun[rownames(betas_fun) %in% rownames(filtered$beta), ]
newBetas <- Harman::shiftBetas(betas_fun, shiftBy = 1e-4)
mvals_fun <- B2M(betas_fun)
pheno <- pData(funnorm)

# save the processed beta and mvals, as well as pheno data
save(
  betas_fun, mvals_fun, pheno,
  file = "../../arsenic-epigenetics-meta/Chile/pbmcs/data/pbmcs_funnorm_data.RData"
)
# perform EDA
champ.QC(
  beta = betas_fun, pheno = pheno$exposed, mdsPlot = TRUE, densityPlot = TRUE,
  dendrogram = TRUE, PDFplot = TRUE, Rplot = FALSE, Feature.sel = "None",
  resultsDir = "preprocessing-reports/QC_postNormalization/"
)

champ.SVD(
  beta = betas_fun, pd = pheno[, -which(colnames(pheno) %in% rm_svd)],
  RGEffect = TRUE, Rplot = FALSE,
  resultsDir = "preprocessing-reports/QC_postNormalization/"
)

Funnorm Normalization Betas Distribution

Funnorm Normalization MDS Plot

Funnorm Normalization Dendogram

SVD Results